package com.gedi.data.dataclean;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;

/**
 * Driver (client) class that configures and submits the "dataClean" MapReduce job:
 * a map-only job that reads a local log file and writes cleaned records.
 *
 * @author RenPu
 * @version 1.0
 * @since 2020/1/19
 */
public class DataCleanClientApplication {

    /**
     * Configures and submits the map-only data-cleaning job, then exits with
     * status 0 on success or 1 on failure so callers (shell scripts, schedulers)
     * can detect a failed run.
     *
     * @param args unused command-line arguments
     * @throws IOException            if job submission fails
     * @throws ClassNotFoundException if the mapper class cannot be resolved
     * @throws InterruptedException   if the wait for job completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {

        // Initialize the Job object.
        Job job = Job.getInstance(new Configuration(), "dataClean");

        // Ship the containing jar to the cluster so the mapper class is found
        // when the job runs outside the local JVM.
        job.setJarByClass(DataCleanClientApplication.class);

        // Set the input and output formats (plain text in, plain text out).
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);

        // Set the input and output paths.
        // NOTE(review): local Windows paths — the output directory must not
        // already exist or the job will fail at submission.
        TextInputFormat.setInputPaths(job,new Path("file:///d:/dataclean.log"));
        TextOutputFormat.setOutputPath(job,new Path("file:///d:/dataClean"));

        // Set the mapper that performs the cleaning.
        job.setMapperClass(DataCleanMapper.class);

        // Map output key/value types. Text here is org.apache.hadoop.io.Text
        // (the Writable) — the original imported javax.xml.soap.Text, which is
        // not a Writable and would break serialization at runtime.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Data cleaning has no reduce phase, so run map-only (0 reducers);
        // the default would otherwise be 1.
        job.setNumReduceTasks(0);

        // Submit the job, print progress to the console, and propagate the
        // job's success/failure as the process exit code.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
